Forest: FluxNet

Context

Purpose

Explore the FLUXNET2015 (Tier 1) dataset, which is open and free for scientific use.

Sensor description

FLUXNET2015 consists of eddy-covariance flux data from several regional networks (FLUXNET). The sensors record a range of local variables relating to atmospheric conditions, solar flux, and soil moisture.

Highlights

  • An intake catalog is used to fetch the dataset from a public S3 bucket.

  • The fetched dataset covers some 340 sites, mostly distributed across temperate areas.

  • Key variables in carbon flux dynamics are visualised by IGBP land cover class.

Authors

Alejandro Coca-Castro, The Alan Turing Institute, @acocac

Note

The author acknowledges FLUXNET2015 researchers and PyViz Topics contributors. Some code snippets were extracted from PyViz's Carbon Monitoring Project.

Install and load libraries

!pip -q install s3fs
import os
import tempfile

import dask
import dask.dataframe  # needed below: `import dask` alone does not expose dask.dataframe
import numpy as np
import pandas as pd
import intake

import holoviews as hv
import hvplot.pandas  # registers the .hvplot accessor on pandas objects
import geoviews.tile_sources as gts

from s3fs import S3FileSystem

pd.options.display.max_columns = 10
hv.extension('bokeh', width=100)

Fetch daily data and metadata

# create a temp dir
path = tempfile.mkdtemp()

catalog_file = os.path.join(path, 'catalog.yaml')

with open(catalog_file, 'w') as f:
    f.write('''
sources:
  fluxnet_daily:
    driver: csv
    parameters:
      s3_path:
        description: Filename to load
        type: str
        default: earth-data/carbon_flux/nee_data_fusion/FLX_AR-SLu_FLUXNET2015_FULLSET_DD_2009-2011_1-3.csv
    cache:
      - argkey: urlpath
        regex: 'earth-data'
        type: file
    args:
      urlpath: "s3://{{ s3_path }}"
      path_as_pattern: 'FLX_{site}_FLUXNET2015_FULLSET_DD_{}.csv'
      csv_kwargs:
        assume_missing: true
        na_values: [-9999]
        parse_dates: ['TIMESTAMP']
      storage_options: {'anon': True}

  fluxnet_metadata:
    driver: csv
    cache:
      - argkey: urlpath
        regex: 'earth-data'
        type: file
    args:
      urlpath: "s3://earth-data/carbon_flux/nee_data_fusion/allflux_metadata.txt"
      csv_kwargs:
        header: null
        names: ['site', 'lat', 'lon', 'igbp', 'network']
        usecols: ['site', 'lat', 'lon', 'igbp']
      storage_options: {'anon': True}
      ''')
cat = intake.open_catalog(catalog_file)
list(cat)
['fluxnet_daily', 'fluxnet_metadata']
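
Each entry can be inspected before loading anything; describe() is part of intake's public API, although the exact fields it returns vary by intake version:

# show the driver, arguments and user parameters of the daily-data entry
cat.fluxnet_daily.describe()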

Load metadata

# read metadata
metadata = cat.fluxnet_metadata().read()
metadata.sample(5)
       site      lat       lon igbp
158  BE-Lon  50.5516    4.7461  CRO
11   CA-SJ1  53.9080 -104.6560  ENF
3    CA-Cha  45.8847  -67.3569   MF
160  BR-Sa1  -2.8567  -54.9589  EBF
289  US-Ivo  68.4865 -155.7503  WET
# declare a dictionary mapping IGBP codes to long class names
igbp_vegetation = {
    'WAT': '00 - Water',
    'ENF': '01 - Evergreen Needleleaf Forest',
    'EBF': '02 - Evergreen Broadleaf Forest',
    'DNF': '03 - Deciduous Needleleaf Forest',
    'DBF': '04 - Deciduous Broadleaf Forest',
    'MF' : '05 - Mixed Forest',
    'CSH': '06 - Closed Shrublands',
    'OSH': '07 - Open Shrublands',
    'WSA': '08 - Woody Savannas',
    'SAV': '09 - Savannas',
    'GRA': '10 - Grasslands',
    'WET': '11 - Permanent Wetlands',
    'CRO': '12 - Croplands',
    'URB': '13 - Urban and Built-up',
    'CNV': '14 - Cropland/Natural Vegetation Mosaics',
    'SNO': '15 - Snow and Ice',
    'BSV': '16 - Barren or Sparsely Vegetated'
}

from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(ordered=True, categories=sorted(igbp_vegetation.values()))
metadata['vegetation'] = (metadata['igbp']
                          .apply(lambda x: igbp_vegetation[x])
                          .astype(dtype))
metadata.sample(5)
       site      lat       lon igbp                        vegetation
316  US-Ton  38.4316 -120.9660  WSA               08 - Woody Savannas
39   US-CPk  41.0680 -106.1187  ENF  01 - Evergreen Needleleaf Forest
77   US-Men  43.0773  -89.4030  WAT                        00 - Water
222  DK-ZaH  74.4733  -20.5503  GRA                   10 - Grasslands
74   US-LPH  42.5419  -72.1850  DBF  04 - Deciduous Broadleaf Forest
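The ordered dtype pays off later: value counts and the horizontal bar chart in the data-quality section follow this numbered order rather than the raw IGBP codes. A quick illustrative check:

# the ordered categories run from '00 - Water' to '16 - Barren or Sparsely Vegetated'
metadata['vegetation'].cat.categories[[0, -1]]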
## helper to count FluxNet sites whose vegetation class matches a keyword
def count_sensors(environment):
    metadata_temp = metadata[metadata['vegetation'].str.contains(environment)]
    print(f'There are {metadata_temp.vegetation.count()} FluxNet sites in {environment}-like environments out of {metadata.vegetation.count()} available sites')
    return metadata_temp

## count sites in forest, shrubland and woody savanna settings
metadata_forest = count_sensors('Forest')
metadata_shrub = count_sensors('Shrublands')
metadata_woody = count_sensors('Woody')
There are 152 FluxNet sites in Forest-like environments out of 340 available sites
There are 10 FluxNet sites in Shrublands-like environments out of 340 available sites
There are 7 FluxNet sites in Woody-like environments out of 340 available sites
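
The same overview can be read off in one call (a minimal alternative; classes with no sites simply show a count of zero):

# sites per vegetation class, in the ordered category order
metadata['vegetation'].value_counts().sort_index()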

Geographical distribution

# visualize across all IGBP types
metadata.hvplot.points('lon', 'lat', geo=True, color='igbp',
                       height=420, width=800, hover_cols=['site', 'vegetation'], cmap='Category20') * gts.OSM
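
The same call works on any of the subsets created earlier; for instance, a sketch reusing metadata_forest from the count_sensors step:

# visualize forest-class sites only
metadata_forest.hvplot.points('lon', 'lat', geo=True, color='igbp',
                              height=420, width=800,
                              hover_cols=['site', 'vegetation'], cmap='Category20') * gts.OSM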

Load daily data

# settings (target variables)
data_columns = ['TIMESTAMP', 'site']
carbon_data_columns = {'NEE_CUT_USTAR50': 'NEE', 'GPP_NT_VUT_REF': 'GPP'}

keep_from_csv = data_columns + list(carbon_data_columns.keys())

# helper functions
def season(df, metadata):
    """Add season column based on lat and month
    """
    site = df['site'].cat.categories.item()
    lat = metadata[metadata['site'] == site]['lat'].item()
    if lat > 0:
        seasons = {3: 'spring',  4: 'spring',  5: 'spring',
                   6: 'summer',  7: 'summer',  8: 'summer',
                   9: 'fall',   10: 'fall',   11: 'fall',
                  12: 'winter',  1: 'winter',  2: 'winter'}
    else:
        seasons = {3: 'fall',    4: 'fall',    5: 'fall',
                   6: 'winter',  7: 'winter',  8: 'winter',
                   9: 'spring', 10: 'spring', 11: 'spring',
                  12: 'summer',  1: 'summer',  2: 'summer'}
    return df.assign(season=df.TIMESTAMP.dt.month.map(seasons))
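
# Hypothetical sanity check of the mapping above (assumption: AR-SLu, a
# Southern Hemisphere site, is present in `metadata`); July should map to winter.
_check = pd.DataFrame({'TIMESTAMP': pd.to_datetime(['2010-07-15']),
                       'site': pd.Categorical(['AR-SLu'])})
assert season(_check, metadata)['season'].item() == 'winter'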


def clean_data(df):
    """
    Clean data columns:

    * add NaN columns for any expected columns that are missing
    * drop unneeded columns
    * add day of year, year and season
    """
    df = df.assign(**{col: np.nan for col in keep_from_csv if col not in df.columns})
    df = df[keep_from_csv]

    df = df.assign(DOY=df.TIMESTAMP.dt.dayofyear)
    df = df.assign(year=df.TIMESTAMP.dt.year)
    df = season(df, metadata)

    return df
# fetch daily data from a public S3 bucket
s3 = S3FileSystem(anon=True)
s3_paths = s3.glob('earth-data/carbon_flux/nee_data_fusion/FLX*')

datasets = []
skipped = []
used = []

for s3_path in s3_paths:
    ddf = cat.fluxnet_daily(s3_path=s3_path).to_dask()
    site = ddf['site'].cat.categories.item()

    # skip files missing the timestamp/site columns needed downstream
    if not set(ddf.columns) >= set(data_columns):
        skipped.append(site)
        continue

    datasets.append(clean_data(ddf))
    used.append(site)
data = dask.dataframe.concat(datasets).compute()

data['site'] = data['site'].astype('category')
data = pd.merge(data, metadata, on="site")
data['vegetation'] = data['vegetation'].cat.remove_unused_categories()
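
A quick summary of what was assembled (the exact numbers depend on the files present in the bucket at run time):

# how many sites made it through, and the shape of the merged table
print(f'{len(used)} sites loaded, {len(skipped)} skipped')
print(data.shape)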

Visualising data quality

def mapper(x):
    if x in used:
        return 'valid'
    elif x in skipped:
        return 'skipped'
    else:
        return 'no data'

cmap = {'valid': 'green', 'skipped': 'red', 'no data': 'darkgray'}

QA = metadata.copy()
QA['quality'] = QA['site'].map(mapper)

all_points = QA.hvplot.points('lon', 'lat', geo=True, color='quality',
                              cmap=cmap, hover_cols=['site', 'vegetation'],
                              height=420, width=600).options(tools=['hover', 'tap'],
                                                             legend_position='top')

def veg_count(data):
    counts = data['vegetation'].value_counts().sort_index(ascending=False)
    return counts.hvplot.barh(height=420, width=500)

hist = veg_count(QA[QA.quality=='valid']).relabel('Vegetation counts for valid sites')

all_points * gts.OSM + hist

Visualising carbon fluxes

# function to plot variable timeseries by IGBP type
def site_timeseries(data, y_variable):
    """Timeseries plot showing the mean carbon flux at each DOY as well as the min and max"""

    tseries = hv.Overlay([
        (data.groupby(['DOY', 'year', 'vegetation'])[y_variable]
             .mean().groupby(['DOY', 'vegetation']).agg([np.min, np.max])
             .hvplot.area('DOY', 'amin', 'amax', alpha=0.2, groupby='vegetation')),
        (data.groupby(['DOY', 'vegetation'])[y_variable]
             .mean()
             .hvplot(label=f'Average {carbon_data_columns[y_variable]}', groupby='vegetation')),
    ])

    return tseries.options(width=800, height=400, xlabel='DOY',
                           ylabel=f'Daily {carbon_data_columns[y_variable]} (gC/m^2/day)',
                           legend_position='top_left')
# plot daily NEE
timeseries_nee = site_timeseries(data, 'NEE_CUT_USTAR50')
timeseries_nee
WARNING:param.Warning: Nesting DynamicMaps within an Overlay makes it difficult to access your data or control how it appears; we recommend calling .collate() on the Overlay in order to follow the recommended nesting structure shown in the Composing Data user guide (http://goo.gl/2YS8LJ)
# plot daily GPP
timeseries_gpp = site_timeseries(data, 'GPP_NT_VUT_REF')
timeseries_gpp
WARNING:param.Warning: Nesting DynamicMaps within an Overlay makes it difficult to access your data or control how it appears; we recommend calling .collate() on the Overlay in order to follow the recommended nesting structure shown in the Composing Data user guide (http://goo.gl/2YS8LJ)
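
Both warnings stem from nesting DynamicMaps inside an Overlay. Following HoloViews' own recommendation in the warning text, calling .collate() on the returned overlay restores the suggested nesting (a minimal sketch; the plot itself is unchanged):

# restructure the overlay into the recommended nesting
timeseries_gpp.collate()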

Summary

TBA